1 Data Manipulation

library(tidyverse)
library(nycflights13)
library(showtext)

font_add_google("Montserrat") # the closest to LBS font-- can get more fonts from fonts.google.com
showtext_auto(enable = TRUE)

2 months with highest and lowest proportion of cancelled flights

flights %>%
  mutate(cancelled = ifelse(is.na(dep_time), TRUE, FALSE)) %>% 
  group_by(month, cancelled) %>% 
  summarise(count = n()) %>% 
  mutate(prop = count / sum(count))
## # A tibble: 24 × 4
## # Groups:   month [12]
##    month cancelled count   prop
##    <int> <lgl>     <int>  <dbl>
##  1     1 FALSE     26483 0.981 
##  2     1 TRUE        521 0.0193
##  3     2 FALSE     23690 0.949 
##  4     2 TRUE       1261 0.0505
##  5     3 FALSE     27973 0.970 
##  6     3 TRUE        861 0.0299
##  7     4 FALSE     27662 0.976 
##  8     4 TRUE        668 0.0236
##  9     5 FALSE     28233 0.980 
## 10     5 TRUE        563 0.0196
## # ℹ 14 more rows
# does it matter how we group_by()? Of course it does!

flights %>%
  mutate(cancelled = ifelse(is.na(dep_time), TRUE, FALSE)) %>% 
  group_by(cancelled, month) %>% 
  summarise(count = n()) %>% 
  mutate(prop = count / sum(count))  
## # A tibble: 24 × 4
## # Groups:   cancelled [2]
##    cancelled month count   prop
##    <lgl>     <int> <int>  <dbl>
##  1 FALSE         1 26483 0.0806
##  2 FALSE         2 23690 0.0721
##  3 FALSE         3 27973 0.0851
##  4 FALSE         4 27662 0.0842
##  5 FALSE         5 28233 0.0859
##  6 FALSE         6 27234 0.0829
##  7 FALSE         7 28485 0.0867
##  8 FALSE         8 28841 0.0878
##  9 FALSE         9 27122 0.0826
## 10 FALSE        10 28653 0.0872
## # ℹ 14 more rows

Plot cancellation % by month

flights %>%
  # find flights that were cancelled, ie, those with no departure time
  mutate(cancelled = ifelse(is.na(dep_time), TRUE, FALSE)) %>% 
  
  # count how many cancellations/normal flights we had per month
  group_by(month, cancelled) %>% 
  summarise(count = n()) %>% 
  
  # use mutate to create a new column that calculates %
  mutate(prop = count / sum(count)) %>%
  
  # only show those flights that were cancelled
  filter(cancelled == TRUE) %>%
  
  # pipe the resulting dataframe to ggpplot
  ggplot() +
  
  # add global aesthetics
  aes(x=factor(month), y = prop)+
  
  #just use a bar chart
  geom_col()+
  
  # change the theme, to theme_light()
  theme_light()+
  
  # format y-axis labels as, e.g., 15% and not 0.15
  scale_y_continuous(labels = scales::percent)

# how about we do it by airport?

flights %>%
  mutate(cancelled = ifelse(is.na(dep_time), TRUE, FALSE)) %>% 
  group_by(origin, month, cancelled) %>% 
  summarise(count = n()) %>% 
  mutate(prop = count / sum(count)) %>% 
  filter(cancelled == TRUE) %>% 
  ggplot() +
  aes(x=factor(month), y = prop)+
  geom_col()+
  theme_light()+
  scale_y_continuous(labels = scales::percent)+
  facet_wrap(~origin, ncol = 1)

3 Problem 3: What plane (specified by tailnum variable) traveled the most times from NYC airports in 2013?

Please left_join() the resulting table with the table planes (also included in the nycflights13 package).

glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
glimpse(planes)
## Rows: 3,322
## Columns: 9
## $ tailnum      <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW…
## $ year         <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 199…
## $ type         <chr> "Fixed wing multi engine", "Fixed wing multi engine", "Fi…
## $ manufacturer <chr> "EMBRAER", "AIRBUS INDUSTRIE", "AIRBUS INDUSTRIE", "AIRBU…
## $ model        <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145…
## $ engines      <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ seats        <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 5…
## $ speed        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ engine       <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turb…
flights %>% 
  
  # left join table flights with planes, that has all the info on airplanes 
  left_join(planes, by="tailnum") %>% 
  
  # for a single variable, count() is the same as group_by() and then summarise()
  # if we add `sort=TRUE`, we get the table sorted in descending order (max to min)
  count(tailnum, sort=TRUE)
## # A tibble: 4,044 × 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N711MQ    486
##  6 N713MQ    483
##  7 N258JB    427
##  8 N298JB    407
##  9 N353JB    404
## 10 N351JB    402
## # ℹ 4,034 more rows

4 For the plane with the greatest number of flights and that had more than 50 seats, please create a table where it flew to during 2013.

which_plane <- flights %>% 
  left_join(planes, by="tailnum") %>% 
  filter(seats >= 50) %>% 
  count(tailnum, sort = TRUE) %>% 
  select(tailnum) %>% 
  
  # slice(1) will pull the first, highest number of flights
  slice(1) %>% 
  
  # pull() creates a vector; if we dont use pull() we get a dataframe
  pull()

which_plane
## [1] "N328AA"
flights %>% 
  filter(tailnum == which_plane) %>% 
  count(origin, dest, sort=TRUE)
## # A tibble: 6 × 3
##   origin dest      n
##   <chr>  <chr> <int>
## 1 JFK    LAX     313
## 2 JFK    SFO      52
## 3 JFK    MIA      25
## 4 JFK    BOS       1
## 5 JFK    MCO       1
## 6 JFK    SJU       1

Problem 4: The nycflights13 package includes a table (weather) that describes the weather during 2013. Use that table to answer the following questions:

5 What is the distribution of temperature (temp) in July 2013? Identify any important outliers in terms of the wind_speed

variable.

weather %>% 
  filter(month == 7) %>% 
  ggplot() +
  aes(x= temp)+
  geom_histogram()

# looking at all months
weather %>% 
 # filter(month == 7) %>% 
  ggplot() +
  aes(x= temp)+
  geom_histogram() +
  facet_wrap(~month, nrow=4)+
  theme_light()

# facet_wrap() for all months' wind_speed
weather %>% 
  ggplot() +
  aes(x= wind_speed)+
  geom_histogram()+
  
  facet_wrap(~month, 
             ncol = 3, # to get 3 columns
             scales = "free")+ # easy to check outliers, by having ggplot automatically adjust scales on axes
  
  theme_light()+
  theme(text=element_text(size=12, family="Montserrat"))

6 Weather correlations

  • What is the relationship between dewp and humid?
  • What is the relationship between precip and visib?
library(GGally) # package to plot scatterplot- correlation matrix
glimpse(weather)
## Rows: 26,115
## Columns: 15
## $ origin     <chr> "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EW…
## $ year       <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…
## $ month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ day        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hour       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, …
## $ temp       <dbl> 39.0, 39.0, 39.0, 39.9, 39.0, 37.9, 39.0, 39.9, 39.9, 41.0,…
## $ dewp       <dbl> 26.06, 26.96, 28.04, 28.04, 28.04, 28.04, 28.04, 28.04, 28.…
## $ humid      <dbl> 59.4, 61.6, 64.4, 62.2, 64.4, 67.2, 64.4, 62.2, 62.2, 59.6,…
## $ wind_dir   <dbl> 270, 250, 240, 250, 260, 240, 240, 250, 260, 260, 260, 330,…
## $ wind_speed <dbl> 10.36, 8.06, 11.51, 12.66, 12.66, 11.51, 14.96, 10.36, 14.9…
## $ wind_gust  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.…
## $ precip     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ pressure   <dbl> 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1013, 1012,…
## $ visib      <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,…
## $ time_hour  <dttm> 2013-01-01 01:00:00, 2013-01-01 02:00:00, 2013-01-01 03:00…
weather %>% 
  select(dewp, humid, precip, visib, temp) %>% 
  ggpairs()+
  theme_bw()

Problem 5: Use the flights and planes tables to answer the following questions:

7 How many planes have a missing date of manufacture?

# to get the dataframe
planes %>% 
  filter(is.na(year))
## # A tibble: 70 × 9
##    tailnum  year type              manufacturer model engines seats speed engine
##    <chr>   <int> <chr>             <chr>        <chr>   <int> <int> <int> <chr> 
##  1 N14558     NA Fixed wing multi… EMBRAER      EMB-…       2    55    NA Turbo…
##  2 N15555     NA Fixed wing multi… EMBRAER      EMB-…       2    55    NA Turbo…
##  3 N15574     NA Fixed wing multi… EMBRAER      EMB-…       2    55    NA Turbo…
##  4 N174US     NA Fixed wing multi… AIRBUS INDU… A321…       2   199    NA Turbo…
##  5 N177US     NA Fixed wing multi… AIRBUS INDU… A321…       2   199    NA Turbo…
##  6 N181UW     NA Fixed wing multi… AIRBUS INDU… A321…       2   199    NA Turbo…
##  7 N18557     NA Fixed wing multi… EMBRAER      EMB-…       2    55    NA Turbo…
##  8 N194UW     NA Fixed wing multi… AIRBUS       A321…       2   199    NA Turbo…
##  9 N238JB     NA Fixed wing multi… EMBRAER      ERJ …       2    20    NA Turbo…
## 10 N271LV     NA Fixed wing multi… BOEING       737-…       2   149    NA Turbo…
## # ℹ 60 more rows
# to get counts
planes %>% 
  count(is.na(year))
## # A tibble: 2 × 2
##   `is.na(year)`     n
##   <lgl>         <int>
## 1 FALSE          3252
## 2 TRUE             70

8 What are the five most common manufacturers?

(Hint: you may need to use case_when() to recode the manufacturer name and collapse rare vendors into a category called Other.)

planes %>% 
  count(manufacturer, sort = TRUE)
## # A tibble: 35 × 2
##    manufacturer                      n
##    <chr>                         <int>
##  1 BOEING                         1630
##  2 AIRBUS INDUSTRIE                400
##  3 BOMBARDIER INC                  368
##  4 AIRBUS                          336
##  5 EMBRAER                         299
##  6 MCDONNELL DOUGLAS               120
##  7 MCDONNELL DOUGLAS AIRCRAFT CO   103
##  8 MCDONNELL DOUGLAS CORPORATION    14
##  9 CANADAIR                          9
## 10 CESSNA                            9
## # ℹ 25 more rows
# recode the manufacturer name and collapse rare vendors into a category called `Other`)

planes <- planes %>% 
  mutate(recode_manufacturer  = case_when( # case_when() is an elegant, multiple ifelse()
    manufacturer %in% c("BOEING") ~ "Boeing",
    manufacturer %in% c("AIRBUS INDUSTRIE", "AIRBUS") ~ "Airbus",
    manufacturer %in% c("EMBRAER") ~ "Embraer",
    manufacturer %in% c("MCDONNELL DOUGLAS", "MCDONNELL DOUGLAS AIRCRAFT CO", "MCDONNELL DOUGLAS CORPORATION" ) ~ "McDonnell Douglas",
    TRUE ~ "Other" # everything else will be "Other"
  )) 

9 Has the distribution of manufacturer changed over time as reflected by the airplanes flying from NYC in 2013?

flights %>% 
  left_join(planes, by = "tailnum") %>% 
  filter(!is.na(recode_manufacturer)) %>% 
  group_by(month, recode_manufacturer) %>% 
  summarise(n = n()) %>% 
  mutate(prop = n/sum(n)) %>% 
  ggplot()+
  aes(x=factor(month), 
      y = prop, 
      group = recode_manufacturer, 
      colour = recode_manufacturer)+
  geom_line()+
  theme_light()+
  theme(text=element_text(size=12, family="Montserrat"))+
  scale_y_continuous(labels = scales::percent)+
  NULL

Problem 6: Use the flights and planes tables to answer the following questions:

10 What is the oldest plane (specified by tailnum) that flew from NYC in 2013?

flights %>% 
  left_join(planes, by = "tailnum") %>%
  rename(year_manufactured = year.y) %>% 
  select(tailnum, year_manufactured) %>% 
  distinct(tailnum, .keep_all = TRUE) %>% 
  arrange(year_manufactured)
## # A tibble: 4,044 × 2
##    tailnum year_manufactured
##    <chr>               <int>
##  1 N381AA               1956
##  2 N201AA               1959
##  3 N567AA               1959
##  4 N575AA               1963
##  5 N378AA               1963
##  6 N14629               1965
##  7 N615AA               1967
##  8 N425AA               1968
##  9 N383AA               1972
## 10 N364AA               1973
## # ℹ 4,034 more rows

11 How many airplanes that flew from New York City are included in the planes table?

flights %>% 
  distinct(tailnum) %>%
  # semi_join() return all rows from x with a match in y.
  semi_join(planes)
## # A tibble: 3,322 × 1
##    tailnum
##    <chr>  
##  1 N14228 
##  2 N24211 
##  3 N619AA 
##  4 N804JB 
##  5 N668DN 
##  6 N39463 
##  7 N516JB 
##  8 N829AS 
##  9 N593JB 
## 10 N793JB 
## # ℹ 3,312 more rows
# quicker. how many palnes from our `planes` table appear in `flights`  
planes %>% 
  semi_join(flights, by="tailnum")
## # A tibble: 3,322 × 10
##    tailnum  year type              manufacturer model engines seats speed engine
##    <chr>   <int> <chr>             <chr>        <chr>   <int> <int> <int> <chr> 
##  1 N10156   2004 Fixed wing multi… EMBRAER      EMB-…       2    55    NA Turbo…
##  2 N102UW   1998 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  3 N103US   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  4 N104UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  5 N10575   2002 Fixed wing multi… EMBRAER      EMB-…       2    55    NA Turbo…
##  6 N105UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  7 N107US   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  8 N108UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  9 N109UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
## 10 N110UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
## # ℹ 3,312 more rows
## # ℹ 1 more variable: recode_manufacturer <chr>
  # An inner_join() only keeps observations from x that have a matching key in y.
flights %>% 
  distinct(tailnum) %>%  
  inner_join(planes)
## # A tibble: 3,322 × 10
##    tailnum  year type              manufacturer model engines seats speed engine
##    <chr>   <int> <chr>             <chr>        <chr>   <int> <int> <int> <chr> 
##  1 N14228   1999 Fixed wing multi… BOEING       737-…       2   149    NA Turbo…
##  2 N24211   1998 Fixed wing multi… BOEING       737-…       2   149    NA Turbo…
##  3 N619AA   1990 Fixed wing multi… BOEING       757-…       2   178    NA Turbo…
##  4 N804JB   2012 Fixed wing multi… AIRBUS       A320…       2   200    NA Turbo…
##  5 N668DN   1991 Fixed wing multi… BOEING       757-…       2   178    NA Turbo…
##  6 N39463   2012 Fixed wing multi… BOEING       737-…       2   191    NA Turbo…
##  7 N516JB   2000 Fixed wing multi… AIRBUS INDU… A320…       2   200    NA Turbo…
##  8 N829AS   1998 Fixed wing multi… CANADAIR     CL-6…       2    55    NA Turbo…
##  9 N593JB   2004 Fixed wing multi… AIRBUS       A320…       2   200    NA Turbo…
## 10 N793JB   2011 Fixed wing multi… AIRBUS       A320…       2   200    NA Turbo…
## # ℹ 3,312 more rows
## # ℹ 1 more variable: recode_manufacturer <chr>

Problem 7: Use the nycflights13 to answer the following questions:

12 median arrival delay on a month-by-airport basis

flights %>% 
  group_by(month, origin) %>% 
  summarise(mean_arrival_delay = mean(arr_delay, na.rm=TRUE),
            median_arrival_delay = median(arr_delay, na.rm=TRUE)
            )
## # A tibble: 36 × 4
## # Groups:   month [12]
##    month origin mean_arrival_delay median_arrival_delay
##    <int> <chr>               <dbl>                <dbl>
##  1     1 EWR                 12.8                     0
##  2     1 JFK                  1.37                   -7
##  3     1 LGA                  3.38                   -4
##  4     2 EWR                  8.78                   -2
##  5     2 JFK                  4.39                   -5
##  6     2 LGA                  3.15                   -4
##  7     3 EWR                 10.6                    -4
##  8     3 JFK                  2.58                   -7
##  9     3 LGA                  3.74                   -7
## 10     4 EWR                 14.1                    -1
## # ℹ 26 more rows

13 plot median arrival delay for each month and origin airport.

flights %>% 
  group_by(carrier, month, origin) %>% 
  summarise(mean_arrival_delay = mean(arr_delay, na.rm=TRUE),
            median_arrival_delay = median(arr_delay, na.rm=TRUE)
            ) %>% 
  left_join(airlines, by = "carrier") %>% 
  rename(carrier_name = name) %>% 
  ggplot()+
  aes(x= factor(month),
      y = median_arrival_delay,
      colour = origin, 
      group = origin) +
  geom_line()+
  facet_wrap(~carrier_name, scales = 'free', ncol=3)+
  theme_light() +
    theme(
    text=element_text(size=10, family="Montserrat"))+
  labs(colour = "Origin Airport")

Problem 8: Let’s take a closer look at what carriers service the route to San Francisco International (SFO). Join the flights and airlines tables and count which airlines flew the most to SFO. Produce a new dataframe, fly_into_sfo that contains three variables: the name of the airline, e.g., United Air Lines Inc. not UA, the count (number) of times it flew to SFO, and the percent of the trips that that particular airline flew to SFO.

14 fly_into_sfo

fly_into_sfo <- flights %>% 
  left_join(airlines, by = "carrier") %>% 
  filter(dest == "SFO") %>% 
  group_by(name) %>% 
  summarise(count = n()) %>% 
  mutate(percent = round(100*count/sum(count),1))

fly_into_sfo
## # A tibble: 5 × 3
##   name                   count percent
##   <chr>                  <int>   <dbl>
## 1 American Airlines Inc.  1422    10.7
## 2 Delta Air Lines Inc.    1858    13.9
## 3 JetBlue Airways         1035     7.8
## 4 United Air Lines Inc.   6819    51.2
## 5 Virgin America          2197    16.5
# And here is some bonus ggplot code to plot your dataframe
fly_into_sfo %>% 
  
  # sort 'name' of airline by the numbers it times to flew to SFO
  mutate(name = fct_reorder(name, count)) %>% 
  
  ggplot() +
  
  aes(x = count, 
      y = name) +
  
  # a simple bar/column plot
  geom_col() +
  
  # add labels, so each bar shows the % of total flights 
  geom_text(aes(label = percent),
             hjust = 1, 
             colour = "white", 
             size = 5)+
  
  # add labels to help our audience  
  labs(title="United (UA) dominates the NYC to SFO route", 
       subtitle = "as % of total flights in 2013",
       x= "Number of flights",
       y= NULL) +
  
  theme_minimal() + 
  
  # change the theme-- i just googled those , but you can use the ggThemeAssist add-in
  # https://cran.r-project.org/web/packages/ggThemeAssist/index.html
  
  theme(#
    # so title is left-aligned
    plot.title.position = "plot",
    
    # text in axes appears larger        
    axis.text = element_text(size=12),
    
    # title text is bigger
    plot.title = element_text(size=18),
    
    text=element_text(size=12, family="Montserrat")
      ) +

  # add one final layer of NULL, so if you comment out any lines
  # you never end up with a hanging `+` that awaits another ggplot layer
  NULL

15 Hollywood Age Gap

How would you explore this data set? Here are some ideas of tables/ graphs to help you with your analysis.

16 How is age_difference distributed? What’s the ‘typical’ age_difference in movies?

# select just age_difference and use the skimr package to get summary stats
age_gaps %>% 
  select(age_difference) %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 1155
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age_difference 0 1 10.4 8.51 0 4 8 15 52 ▇▃▂▁▁
# histogram
age_gaps %>%
  ggplot()+
  aes(x = age_difference)+
  geom_histogram()+
  theme_bw()

# faceted by character 1 gender 
age_gaps %>%
  ggplot()+
  aes(x = age_difference)+
  geom_histogram()+
  theme_bw()+
  facet_wrap(~character_1_gender, ncol = 1)

17 The half plus seven rule

Large age disparities in relationships carry certain stigmas. One popular rule of thumb is the half-your-age-plus-seven rule. This rule states you should never date anyone under half your age plus seven, establishing a minimum boundary on whom one can date. In order for a dating relationship to be acceptable under this rule, your partner’s age must be:

\[\frac{\text{Your age}}{2} + 7 < \text{Partner Age} < (\text{Your age} - 7) * 2\]

How frequently does this rule apply in this dataset?

age_gaps <- age_gaps %>% 
  mutate(
    lower_limit = actor_1_age/2 + 7,
    half_plus_seven = ifelse(actor_2_age >= lower_limit , 
                             TRUE, 
                             FALSE))

age_gaps %>% 
  count(half_plus_seven) %>% 
  mutate(perc = n/sum(n))
## # A tibble: 2 × 3
##   half_plus_seven     n  perc
##   <lgl>           <int> <dbl>
## 1 FALSE             326 0.282
## 2 TRUE              829 0.718

18 Which movie has the greatest number of love interests? Waht was the median age difference for these movies?

age_gaps %>% 
  group_by(movie_name) %>% 
  summarise(n = n(),
            median_age_difference = median(age_difference, na.rm=TRUE)) %>% 
  arrange(desc(n))
## # A tibble: 830 × 3
##    movie_name                      n median_age_difference
##    <chr>                       <int>                 <dbl>
##  1 Love Actually                   7                  13  
##  2 The Family Stone                6                   4  
##  3 A View to a Kill                5                  28  
##  4 He's Just Not That Into You     5                   3  
##  5 Mona Lisa Smile                 5                   3  
##  6 A Star Is Born                  4                  10  
##  7 American Pie                    4                   5  
##  8 Boogie Nights                   4                   6.5
##  9 Closer                          4                   7  
## 10 Pushing Tin                     4                  11.5
## # ℹ 820 more rows

19 Which actors/ actresses have the greatest number of love interests in this dataset?

age_gaps %>% 
  count(actor_1_name, sort=TRUE)
## # A tibble: 567 × 2
##    actor_1_name          n
##    <chr>             <int>
##  1 Keanu Reeves         24
##  2 Adam Sandler         18
##  3 Roger Moore          17
##  4 Sean Connery         15
##  5 Harrison Ford        13
##  6 Johnny Depp          12
##  7 Pierce Brosnan       12
##  8 Leonardo DiCaprio    11
##  9 Richard Gere         10
## 10 Brad Pitt             9
## # ℹ 557 more rows
age_gaps %>% 
  count(actor_2_name, sort=TRUE)
## # A tibble: 647 × 2
##    actor_2_name           n
##    <chr>              <int>
##  1 Keira Knightley       13
##  2 Scarlett Johansson    13
##  3 Emma Stone            11
##  4 Renee Zellweger       11
##  5 Drew Barrymore         9
##  6 Jennifer Lawrence      9
##  7 Julia Roberts          9
##  8 Amanda Seyfried        8
##  9 Audrey Hepburn         8
## 10 Jennifer Aniston       8
## # ℹ 637 more rows

20 Is the median age difference staying constant over the years (1935 - 2022)?

age_gaps %>% 
  group_by(release_year) %>% 
  summarise(median_difference = median(age_difference, na.rm = TRUE)) %>% 
  ggplot()+
  aes(x=release_year, y = median_difference) + 
  geom_line()+
  geom_smooth(se=FALSE)+
  theme_light()+
  theme(text=element_text(size=12, family="Montserrat"))

21 How frequently does Hollywood depict same-gender love interests?

age_gaps <- age_gaps %>% 
  mutate(same_gender = 
           ifelse(character_1_gender == character_2_gender, 
                  TRUE, 
                  FALSE))

age_gaps %>% 
  count(same_gender) %>% 
  mutate(prop = n/sum(n))
## # A tibble: 2 × 3
##   same_gender     n   prop
##   <lgl>       <int>  <dbl>
## 1 FALSE        1132 0.980 
## 2 TRUE           23 0.0199
age_gaps %>% 
  ggplot()+
  aes(x = release_year, y = age_difference, colour = character_1_gender)+
  geom_point(alpha = 0.6)+
  facet_wrap(~same_gender, ncol=1)+
  theme_bw()+
  theme(text=element_text(size=12, family="Montserrat"))+
  labs(
    colour = "Protagonist's gender"
  )

22 same-gender over time

age_gaps %>% 
  group_by(release_year,same_gender) %>% 
  summarise( n = n()) %>% 
  mutate(prop = n/sum(n)) %>% 
  
  filter(same_gender == FALSE) %>% 
  mutate(same_gender_prop = 1 - prop) %>% 
  ggplot()+
  aes(x=release_year, y = same_gender_prop)+
  
  geom_line()+
  scale_y_continuous(labels = scales::percent)+
  theme_light()

23 Interactive exploration

library(plotly)

plot <- ggplot(age_gaps)+
  geom_point(aes(x=actor_1_age, 
                 y = actor_2_age, 
                 colour = character_1_gender, 
                 
                 # tooltip by default shows all variables used in aes()
                 # to add extra text in tooltip, we use text 
                 text = paste(actor_1_name, "and", actor_2_name, "@",movie_name,"(",release_year,")")), alpha = 0.8)+
  
  # add a line in steelblue that shows lower limit of half plus 7 rule
  geom_line(aes(x = actor_1_age, y = lower_limit),  colour = "steelblue", size = 1.5)+
  theme_light()+
  facet_wrap(~same_gender, ncol=1)


ggplotly(plot)

24 Most protagonists are men below 40 years

base_plot <- age_gaps %>% 
  ggplot()+
  aes(x=actor_1_age, 
      fill = character_1_gender)+
  theme_light()+
  labs(
    title = "Most protagonists are men; majority < 40 years",
    x = "Actors' age",
    fill = "Protagonist's Gender",
    y = NULL
  )+
  theme(text=element_text(size=12, family="Montserrat"))
  
  
base_plot + geom_histogram(alpha = 0.7)

base_plot + geom_density(alpha = 0.6)

base_plot + geom_boxplot(alpha = 0.7)  

base_plot + stat_ecdf(aes(colour = character_1_gender), geom = "step", pad = FALSE) +
  scale_y_continuous(labels = scales::percent)